CMV Omics - Script 2: CMV effects in telomere length, aging markers and blood cell proportions.

R Markdown of second script. Read data of LLD CMV seropositivity predictions (from script 1), telomere length measurements in 6 blood types (Flow-FISH), (blood) methylation age, T-cell maturation (sj-TRECS), and cell counts measured in blood and predicted from blood bulk RNA-seq.

Location_dir = "~/Documents/GitHub/CMV_Associations/"

library(tidyverse)
library(patchwork) #Combine plots
library(lme4) # for mixed-model analysis of CMV-sex interaction
library(regmed) #For multiple mediation
library(MetBrewer) #coloring
library(mediation) #individual mediation
library(readxl)
library(pheatmap)

library(foreach) #paralellize FDR computation
library(doParallel)

source(paste0(Location_dir, "scripts/Functions.R"))
source(paste0(Location_dir, "scripts/Functions_multivariate.R"))
#####################
#LLD CMV prediction##
#####################
Covariates = read_tsv(paste0(Location_dir,"Results/LLD_CMV_profile.tsv")) %>% filter(TimePoint == "Baseline") #1,488 samples

#######################
#predicted cell types #
#######################
all_cell_types = read_tsv(paste0(Location_dir,"Data/BIOS_cell_types_DeconCell_2019-03-08.LLD_subset.txt")) #Cell counts from Blood bulkRNA deconvolution
#Formatting data since we want samples as columns intead of as rows
all_cell_types %>% gather(key = var_name, value = value, 2:ncol(all_cell_types)) %>% spread_(key = names(all_cell_types)[1],value = 'value') %>% rename(ID = var_name) -> all_cell_types #1,180 samples with cell predictions

#######################
# measured cell types##
#######################
Phenotypes <- read_delim(paste0(Location_dir,"Data/Merged_Phenotypes_LLD.csv"), delim="|") 
Phenotypes %>%  dplyr::select("ID", "Lymph", "Eryth", "Mono", "Neutro", "Thrombo", "Eosino","Blood_baso") %>% drop_na() -> cell_counts #1,570 LLD samples with blood cell counts

#######################
# Aging markers #######
#######################

Age_markers <- read_tsv(paste0(Location_dir,"Data/Combined_data_uncorrected.tsv"))
Telomere_markers <- colnames(Age_markers)[grepl("MTL",colnames(Age_markers))]
Telomere_markers = Age_markers %>% dplyr::select(c("Sample",Telomere_markers)) %>% mutate(MTL_gran = as.numeric(MTL_gran), `MTL_CD20+`=as.numeric(`MTL_CD20+`), `MTL_CD45+_CD20-`=as.numeric(`MTL_CD45+_CD20-`),`MTL_CD57+`=as.numeric(`MTL_CD57+`)) %>%
  `colnames<-`(c("ID", "TL_Lymphocytes", "TL_Granulocytes", "TL_Naive_T-cells", "TL_Memory_T-cells","TL_B-cells","TL_NK-cells"))
#Remove samples with all telomere lengths missing
Telomere_markers[rowSums(is.na(Telomere_markers)) != (ncol(Telomere_markers)-1), ] -> Telomere_markers
#Some samples are missing telomeres, we will impute it later. 1,299 samples with telomere lengths

MethylAge = dplyr::select(Age_markers, c(Sample, Methyl.Age.Hannum.,Methyl.Age.Weidner.,Methyl.Age.Horvath.)) %>% rename(ID = Sample) %>% drop_na() #647 LLD samples with methylation age

#Get all markers together
read_tsv( paste0(Location_dir,"Data/Aging_and_health_markers.tsv")) -> Aging_health_markers

First, we will start imputing missing values in the telomere length. Imputation function is included in scripts/Functions.R. Imputation function uses a seed. Then splits the data in 80% training and 20% test. For each cell type, it fits a linear model in the training data, using all other telomere lengths as predictors. R2 is calculated on the predicted data from the test (using caret). Then this is done in the data where there is actually missing values, and the accuracy obtained from the data with complete cases is used as an unbiased estimate.

List_telomere_imputation = Impute_telomere_data(Telomere_markers)
Imputed_Telomere_markers = List_telomere_imputation[[1]] ; Imputed_accuracy = List_telomere_imputation[[2]]
Telomere_markers %>% gather(Cell, TL, 2:7) %>% mutate(Type="Real") -> Real_lengths
Imputed_Telomere_markers %>% gather(Cell, TL, 2:7) %>% mutate(Type="Imputed") -> Imputed_lengths
rbind(Real_lengths, Imputed_lengths) %>% ggplot( aes(x=TL)  ) + theme_bw() + geom_histogram() + facet_grid(Cell~Type) 

Predicted cell counts are proportions based on bulk RNA-seq data. RNA-seq counts are compositional. And thus, the cell proportions also are interdependent between them. To alleviate this interdependency we will use log ratios. I have tested to different log ratios: - We will use Monocytes proportion (`Monocytes (CD14+)) as our reference, and perform ALR on that. - We will use per-sample geometric means as references, and perform CLR. cell_matrix_xlr will be the ratio used for subsequent analyses

Clean_and_ALR(all_cell_types) -> cell_matrix_alr
Clean_and_CLR(all_cell_types) -> cell_matrix_clr
cell_matrix_xlr = cell_matrix_alr #Matrix to use

Correlation of CMV seropositivity with 1. Aging markers, 2. Predicted cell counts, 3. Cell counts. We will use a linear model while controlling for chronological age and sex.

Basic_association = function( DF = Imputed_Telomere_markers , Covariates = Covariates ){
  Results = tibble()
  left_join(Covariates, DF, by="ID") -> DF_association
  for ( Phenotype in colnames(select(DF, -ID)) ){
    DF_association[Phenotype] = scale(DF_association[Phenotype])
    Formula = as.formula( paste0("`", Phenotype, "` ~ Age + Sex + CMV_prediction" )  )
    lm(Formula, DF_association) %>% summary() -> Result_pheno
    N  =length(Result_pheno$residuals)
    Result_pheno$coefficients %>% as.data.frame() %>% rownames_to_column("Regressor") %>% filter(Regressor == "CMV_prediction") %>% mutate(Phenotype = Phenotype, .before=1) %>% select(-Regressor) %>% as_tibble() %>% mutate(N_samples = N) -> Result_pheno
    rbind(Results, Result_pheno) -> Results
  }
  return(Results)
}

Association_telomeres = Basic_association(Imputed_Telomere_markers, Covariates) %>% mutate(Pheno_class = "TL")
Association_Methylation = Basic_association(MethylAge, Covariates)%>% mutate(Pheno_class = "Methyl_Age")
Association_CellPred =  Basic_association(cell_matrix_xlr, Covariates)%>% mutate(Pheno_class = "PredictedCell") 
Association_Cell =  Basic_association(cell_counts, Covariates)%>% mutate(Pheno_class = "CellCounts")

Association_telomeres %>% ggplot(aes(x= Phenotype ,y = Estimate, fill = -log10(`Pr(>|t|)`)  )) + geom_bar(stat="identity") + theme_bw() + coord_flip() + scale_fill_gradientn( colours =  met.brewer(name="VanGogh3", type="continuous")) -> Telomere_association_plot


rbind(rbind(Association_telomeres, Association_CellPred), Association_Cell) -> Association_results

#Plot
Association_results %>% mutate(P.threshold = ifelse(`Pr(>|t|)` < 0.000001, "P<1e-6", ifelse(`Pr(>|t|)` < 0.0001, "1e-6<P<1e-4", ifelse(`Pr(>|t|)` < 0.001, "1e-4<P<1e-3", ifelse(`Pr(>|t|)` < 0.01, "1e-3<P<1e-2", "P>1e-2" )))) ) %>% mutate(P.threshold = factor(P.threshold, levels = c("P>1e-2","1e-3<P<1e-2","1e-4<P<1e-3","1e-6<P<1e-4", "P<1e-6" ) ) )  %>% ggplot(aes(x= Phenotype ,y = Estimate, color = P.threshold )) + geom_point() + theme_bw() + coord_flip() + facet_wrap(~Pheno_class, scales="free_y") + theme(axis.title.y=element_blank(), axis.text.y=element_text(size=8) ) + scale_color_discrete(type = met.brewer(name="Klimt", n=5,type="discrete"))  + geom_errorbar(aes(ymin=Estimate-2*`Std. Error`, ymax=Estimate+2*`Std. Error`), width=.2) + geom_hline(yintercept = 0, linetype='dotted', col = 'grey' ) -> All_associations_plot
#Save
print(Association_results %>% arrange(`Pr(>|t|)`) )
## # A tibble: 45 × 7
##    Phenotype                Estimate Std. Er…¹ t val…² Pr(>|t|…³ N_sam…⁴ Pheno…⁵
##    <chr>                       <dbl>     <dbl>   <dbl>     <dbl>   <int> <chr>  
##  1 CD8+ EM CD45RA- CD27-       1.22     0.0491   24.9  5.66e-110    1173 Predic…
##  2 TL_NK-cells                -0.528    0.0533   -9.90 3.09e- 22    1166 TL     
##  3 CD8+ T cells                0.440    0.0582    7.57 7.39e- 14    1173 Predic…
##  4 Lymph                       0.328    0.0541    6.06 1.78e-  9    1408 CellCo…
##  5 TL_Lymphocytes             -0.292    0.0525   -5.56 3.27e-  8    1166 TL     
##  6 CD4+ Naive CD45RO- CD27+   -0.296    0.0570   -5.19 2.45e-  7    1172 Predic…
##  7 CD4+ Naive CD45RA+ CD27+   -0.252    0.0571   -4.42 1.10e-  5    1172 Predic…
##  8 Granulocytes               -0.252    0.0588   -4.29 1.91e-  5    1173 Predic…
##  9 Prol CD4+ Treg             -0.244    0.0589   -4.14 3.73e-  5    1173 Predic…
## 10 TL_Memory_T-cells          -0.227    0.0565   -4.01 6.43e-  5    1166 TL     
## # … with 35 more rows, and abbreviated variable names ¹​`Std. Error`,
## #   ²​`t value`, ³​`Pr(>|t|)`, ⁴​N_samples, ⁵​Pheno_class
write_tsv(rbind(Association_results, Association_Methylation ) , paste0(Location_dir,"Results/Associations_CMV_Cell_Age.tsv"))


ggsave(paste0(Location_dir,"Results/Figures/CMV_associations.pdf"), All_associations_plot, height = 4, width = 10 )

We observe an association of CMV with cell counts and telomere lengths. We wonder if this associations are age or sex dependent. For that, fitted interaction models and observe if adding an interative term is better than an additive model.

Interaction_association = function( DF = Imputed_Telomere_markers , Covariates = Covariates ){
  Results = tibble()
  left_join(Covariates, DF, by="ID") -> DF_association
  for ( Phenotype in colnames(select(DF, -ID)) ){
    DF_association[Phenotype] = scale(DF_association[Phenotype])
    Formula0 = as.formula( paste0("`", Phenotype, "` ~ Sex + Age + CMV_prediction" )  )
    Formula1 = as.formula( paste0("`", Phenotype, "` ~ Sex + Age * CMV_prediction" )  )
    Formula2 = as.formula( paste0("`", Phenotype, "` ~ Age + Sex  * CMV_prediction" )  )
    #Formula3 = as.formula( paste0("`", Phenotype, "` ~ Sex * Age * CMV_prediction" )  )
    lm(Formula0, DF_association) -> R0
    lm(Formula1, DF_association) -> R1
    lm(Formula2, DF_association) -> R2
    #lm(Formula3, DF_association) -> R3
    
    anova(R0, R1) -> A ; A$`Pr(>F)` -> P1
    anova(R0, R2) -> A ; A$`Pr(>F)` -> P2

    for (i in c(1,2)){
      Model = list(R1, R2)[[i]]
      Model %>% summary() -> Result_pheno
      N  =length(Result_pheno$residuals)
      Result_pheno$coefficients %>% as.data.frame() %>% rownames_to_column("Regressor")  %>% mutate(Phenotype = Phenotype, .before=1)  %>% as_tibble() %>% mutate(N_samples = N) %>% mutate(Model= c("Age-CMV", "Sex-CMV")[i], P_model = c(P1[2], P2[2])[i] ) -> Result_pheno
    rbind(Results, Result_pheno) -> Results
    }
  }
  return(Results)
}

#Telomere interaction
Association_telomeres_interaction = Interaction_association(Imputed_Telomere_markers, Covariates) %>% mutate(Pheno_class = "TL")
Association_telomeres_interaction %>% filter(grepl(":", Regressor)) %>% arrange(`Pr(>|t|)`) %>% print()
## # A tibble: 12 × 10
##    Phenotype      Regre…¹ Estimate Std. …² t val…³ Pr(>|…⁴ N_sam…⁵ Model P_model
##    <chr>          <chr>      <dbl>   <dbl>   <dbl>   <dbl>   <int> <chr>   <dbl>
##  1 TL_NK-cells    Sex:CM… -2.84e-1 0.106   -2.67   0.00769    1166 Sex-… 0.00769
##  2 TL_Granulocyt… Sex:CM… -2.42e-1 0.114   -2.12   0.0340     1166 Sex-… 0.0340 
##  3 TL_Memory_T-c… Sex:CM… -2.18e-1 0.113   -1.93   0.0537     1166 Sex-… 0.0537 
##  4 TL_B-cells     Sex:CM… -1.84e-1 0.111   -1.65   0.0987     1166 Sex-… 0.0987 
##  5 TL_Naive_T-ce… Sex:CM… -1.75e-1 0.106   -1.65   0.0992     1166 Sex-… 0.0992 
##  6 TL_Lymphocytes Sex:CM… -1.52e-1 0.105   -1.45   0.149      1166 Sex-… 0.149  
##  7 TL_Memory_T-c… Age:CM…  5.46e-3 0.00408  1.34   0.181      1166 Age-… 0.181  
##  8 TL_NK-cells    Age:CM… -3.53e-3 0.00385 -0.918  0.359      1166 Age-… 0.359  
##  9 TL_B-cells     Age:CM…  3.54e-3 0.00403  0.878  0.380      1166 Age-… 0.380  
## 10 TL_Granulocyt… Age:CM…  2.61e-3 0.00412  0.633  0.527      1166 Age-… 0.527  
## 11 TL_Naive_T-ce… Age:CM… -3.75e-4 0.00384 -0.0977 0.922      1166 Age-… 0.922  
## 12 TL_Lymphocytes Age:CM… -3.24e-4 0.00379 -0.0854 0.932      1166 Age-… 0.932  
## # … with 1 more variable: Pheno_class <chr>, and abbreviated variable names
## #   ¹​Regressor, ²​`Std. Error`, ³​`t value`, ⁴​`Pr(>|t|)`, ⁵​N_samples
#Predicted Cell interaction 
Association_CellPred_interaction =  Interaction_association(cell_matrix_xlr, Covariates)%>% mutate(Pheno_class = "PredictedCell") 
Association_CellPred_interaction %>% filter(grepl(":", Regressor)) %>% arrange(`Pr(>|t|)`) %>% print()
## # A tibble: 64 × 10
##    Pheno…¹ Regre…² Estim…³ Std. …⁴ t val…⁵ Pr(>|…⁶ N_sam…⁷ Model P_model Pheno…⁸
##    <chr>   <chr>     <dbl>   <dbl>   <dbl>   <dbl>   <int> <chr>   <dbl> <chr>  
##  1 DN (CD… Age:CM… 0.0144  0.00415    3.48 5.25e-4    1173 Age-… 5.25e-4 Predic…
##  2 CD4+ N… Sex:CM… 0.316   0.113      2.80 5.18e-3    1172 Sex-… 5.18e-3 Predic…
##  3 CD4+ N… Sex:CM… 0.295   0.113      2.61 9.17e-3    1172 Sex-… 9.17e-3 Predic…
##  4 B cell… Sex:CM… 0.291   0.119      2.45 1.42e-2    1173 Sex-… 1.42e-2 Predic…
##  5 CD8+ E… Age:CM… 0.00856 0.00355    2.41 1.62e-2    1173 Age-… 1.62e-2 Predic…
##  6 IgD- C… Sex:CM… 0.274   0.114      2.41 1.62e-2    1173 Sex-… 1.62e-2 Predic…
##  7 CD4+ T… Sex:CM… 0.274   0.114      2.41 1.62e-2    1173 Sex-… 1.62e-2 Predic…
##  8 CD45RO… Sex:CM… 0.266   0.112      2.37 1.81e-2    1173 Sex-… 1.81e-2 Predic…
##  9 Prol C… Sex:CM… 0.267   0.114      2.35 1.89e-2    1173 Sex-… 1.89e-2 Predic…
## 10 IgD+ I… Sex:CM… 0.279   0.121      2.31 2.11e-2    1172 Sex-… 2.11e-2 Predic…
## # … with 54 more rows, and abbreviated variable names ¹​Phenotype, ²​Regressor,
## #   ³​Estimate, ⁴​`Std. Error`, ⁵​`t value`, ⁶​`Pr(>|t|)`, ⁷​N_samples, ⁸​Pheno_class
Get_Permutation_FDR = function(Assocation,DF, Cov, Permutations=100, Term ="Sex" ){
  set.seed(87)
  N_perm = Permutations
  
  num_cores <- 4  # Number of cores/workers to be used
  cl <- makeCluster(num_cores)
  registerDoParallel(cl)
  
  Permuted_ps <- foreach(Permutation = 1:N_perm, .combine = c, .packages = c("dplyr","tibble" ),.export = c("Interaction_association") ) %dopar% {
  print(paste0("Permutation:", Permutation))
  Permuted_cmv <- sample(Cov$CMV_prediction, replace = FALSE, size = dim(Cov)[1])
  if (Term == "Sex") {
    Permuted_Sex <- sample(Cov$Sex, replace = FALSE, size = dim(Cov)[1])
    Permuted_cov <- mutate(Cov, CMV_prediction = Permuted_cmv, Sex = Permuted_Sex)
    Permuted <- Interaction_association(DF, Covariates = Permuted_cov)
    P_p <- Permuted %>% filter(Regressor == "Sex:CMV_prediction")
    }
  P_p$`Pr(>|t|)`
  }
  
  stopCluster(cl)

  FDR = Compute_FDR_null(Assocation, Permuted_ps)
  FDR2 = Compute_FDR2(Assocation, Permuted_ps, Permutations)
  return(list(FDR, FDR2))
}
Compute_FDR_null =  function(P, Permutation_results,fdr_threshold = 0.05 ){
  # calculate FDR for each observed P-value
  fdr_values <- sapply(P, function(p) {
    # count number of false positives (i.e., permuted P < observed P)
    false_positives <- sum(Permutation_results <= p)
    # calculate FDR
    fdr <- false_positives / sum(Permutation_results <= fdr_threshold)
    # ensure FDR is between 0 and 1
    fdr <- ifelse(fdr < 0, 0, ifelse(fdr > 1, 1, fdr))
    return(fdr)
  })
  return(fdr_values)
}  
Compute_FDR2 =  function(P, Permutation_results,Permutations = 100 ){
  Order = order(P)
  tibble(Position = Order, FDR = NA) -> values
  fdr_values = c()
  for ( i in seq(P) ){
    Threshold = P[which(Order == i)]
    FDR_v = sum(Permutation_results <= Threshold) / Permutations / i
    values %>% mutate(FDR = ifelse(Position == i, FDR_v, FDR  )) -> values
  }
  return(values$FDR)
}  


#Sex interaction
rbind( Association_telomeres_interaction,Association_CellPred_interaction)   %>% filter(Regressor == "Sex:CMV_prediction" ) %>% select(`Pr(>|t|)`) %>% as_vector() -> Sex_Ps
#Association_telomeres_interaction  %>% filter(Regressor == "Sex:CMV_prediction" ) %>% select(`Pr(>|t|)`) %>% as_vector() -> Sex_Ps


Get_Permutation_FDR(Assocation = Sex_Ps, DF = full_join(Imputed_Telomere_markers,cell_matrix_xlr), Cov = Covariates, Term = "Sex", Permutations = 1000 ) -> FDR_perm 


rbind( Association_telomeres_interaction,Association_CellPred_interaction)   %>% filter(Regressor == "Sex:CMV_prediction" ) %>% mutate(FDR_perm1 = FDR_perm[[1]], FDR_perm2 = FDR_perm[[2]], FDR_bh = p.adjust(`Pr(>|t|)`, "fdr")  ) %>% arrange(FDR_perm2) -> Association_interaction

write_tsv(Association_interaction %>% select(-c(FDR_perm1, FDR_bh)) %>% rename(FDR_p = FDR_perm2) , paste0(Location_dir,"Results/CMV_sex_interaction_TLandCells.tsv"))

We observe some signal in telomere length interaction, and in cell predictions. In order to gain power in telomere length interactions, we will run the model using random effects, so that we can pull all cell types together, and assuming the effect is the same to all cell types. Model: TL ~ CMV + Age + Sex + Cell + Interaction_term + Cell:CMV + (1|ID)

set.seed(895)

left_join( select(Covariates, c("ID", "Sex", "Age", "CMV_prediction")) , Imputed_Telomere_markers, by="ID")  %>%
 gather(Cell, TL ,5:10) -> Long_DF_model
#Center Age
Long_DF_model$Age = Long_DF_model$Age - mean(Long_DF_model$Age[!is.na(Long_DF_model$Age)] )

Long_DF_model %>% lmerTest::lmer(TL ~ CMV_prediction + Age + Sex + Cell + CMV_prediction +  CMV_prediction:Cell + (1|ID), data= ., REML = F ) -> Mixed_model_telomeres_0
Long_DF_model %>% lmerTest::lmer(TL ~ CMV_prediction + Age + Sex + Cell + CMV_prediction:Sex +CMV_prediction:Cell  + (1|ID), data= ., REML = F ) -> Mixed_model_telomeres_sex
Long_DF_model %>% lmer(TL ~ CMV_prediction + Age + Sex + Cell + CMV_prediction:Age + CMV_prediction:Cell + (1|ID), data= . ,REML = F) -> Mixed_model_telomeres_age



print(Mixed_model_telomeres_sex %>% summary())
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: TL ~ CMV_prediction + Age + Sex + Cell + CMV_prediction:Sex +  
##     CMV_prediction:Cell + (1 | ID)
##    Data: .
## 
##      AIC      BIC   logLik deviance df.resid 
##  11213.8  11330.3  -5589.9  11179.8     6979 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -10.5810  -0.4839   0.0073   0.5139   7.0914 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  ID       (Intercept) 0.5916   0.7692  
##  Residual             0.1736   0.4167  
## Number of obs: 6996, groups:  ID, 1166
## 
## Fixed effects:
##                                        Estimate Std. Error         df t value
## (Intercept)                           7.406e+00  9.718e-02  1.216e+03  76.213
## CMV_prediction                        2.277e-01  1.625e-01  1.215e+03   1.402
## Age                                  -2.872e-02  1.713e-03  1.166e+03 -16.767
## Sex                                   3.414e-01  5.849e-02  1.166e+03   5.836
## CellTL_Granulocytes                  -2.140e-01  2.163e-02  5.830e+03  -9.890
## CellTL_Lymphocytes                   -7.320e-01  2.163e-02  5.830e+03 -33.837
## CellTL_Memory_T-cells                -1.339e+00  2.163e-02  5.830e+03 -61.881
## CellTL_Naive_T-cells                  3.669e-02  2.163e-02  5.830e+03   1.696
## CellTL_NK-cells                      -1.733e+00  2.163e-02  5.830e+03 -80.110
## CMV_prediction:Sex                   -2.076e-01  9.729e-02  1.166e+03  -2.134
## CMV_prediction:CellTL_Granulocytes    4.906e-03  3.588e-02  5.830e+03   0.137
## CMV_prediction:CellTL_Lymphocytes    -2.036e-01  3.588e-02  5.830e+03  -5.674
## CMV_prediction:CellTL_Memory_T-cells -1.010e-01  3.588e-02  5.830e+03  -2.814
## CMV_prediction:CellTL_Naive_T-cells  -1.119e-01  3.588e-02  5.830e+03  -3.119
## CMV_prediction:CellTL_NK-cells       -4.692e-01  3.588e-02  5.830e+03 -13.079
##                                      Pr(>|t|)    
## (Intercept)                           < 2e-16 ***
## CMV_prediction                        0.16128    
## Age                                   < 2e-16 ***
## Sex                                  6.93e-09 ***
## CellTL_Granulocytes                   < 2e-16 ***
## CellTL_Lymphocytes                    < 2e-16 ***
## CellTL_Memory_T-cells                 < 2e-16 ***
## CellTL_Naive_T-cells                  0.08997 .  
## CellTL_NK-cells                       < 2e-16 ***
## CMV_prediction:Sex                    0.03303 *  
## CMV_prediction:CellTL_Granulocytes    0.89124    
## CMV_prediction:CellTL_Lymphocytes    1.47e-08 ***
## CMV_prediction:CellTL_Memory_T-cells  0.00490 ** 
## CMV_prediction:CellTL_Naive_T-cells   0.00182 ** 
## CMV_prediction:CellTL_NK-cells        < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We observe a significant interaction effect of sex with CMV. The reduction of telomere length if female is higher than if males. We will show this in a plot:

Long_DF_model %>% mutate(`Sex:CMV` = ifelse(CMV_prediction==1 & Sex==2, "CMV_Female", ifelse(CMV_prediction==1 & Sex==1, "CMV_Male", ifelse(CMV_prediction==0 & Sex==1, "Male", "Female") )   )  )  %>% drop_na() %>% ggplot(aes(x=Age, y= TL, col=`Sex:CMV`  )) + geom_point(alpha=0.2, size=1) + geom_smooth(method='lm', formula= y~x, se=F) + theme_bw() + facet_wrap(~Cell) + scale_color_discrete(type = met.brewer(name="Klimt", n=5,type="discrete") )

left_join( select(Covariates, c("ID", "Sex", "Age", "CMV_prediction")) , cell_matrix_clr, by="ID")  %>%
 gather(Cell, value ,5:37) -> Long_DF_model_cell


Association_interaction %>% filter(grepl(":", Regressor)) %>% arrange(`Pr(>|t|)`) %>% filter(FDR_perm2 < 0.2) -> Plot_cell
Long_DF_model_cell %>% filter(Cell %in%  Plot_cell$Phenotype)  %>% mutate(CMV_prediction = as.factor(CMV_prediction)) %>% drop_na() %>% ggplot(aes(x=Age, y= value, col=CMV_prediction )) + geom_point(alpha=0.2, size=1) + geom_smooth(method='lm', formula= y~x, se=F) + theme_bw() + facet_wrap(~Cell, scales = "free") + scale_color_discrete(type = met.brewer(name="Klimt", n=2,type="discrete") ) + ylab("Cell proportion (CLR)")  

And we checked how CMV infection confound TL-sex associations

left_join(Imputed_Telomere_markers, Covariates) -> For_testing
For_testing %>% filter(CMV_prediction == 0) %>% lm(`TL_NK-cells` ~ Age + Sex + PC1+PC2+PC3+PC4+PC5, . ) %>% summary()
## 
## Call:
## lm(formula = `TL_NK-cells` ~ Age + Sex + PC1 + PC2 + PC3 + PC4 + 
##     PC5, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6770 -0.5435 -0.0455  0.5357  2.5417 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.260272   0.253531  24.692  < 2e-16 ***
## Age         -0.025358   0.002509 -10.107  < 2e-16 ***
## Sex          0.346556   0.061959   5.593 3.14e-08 ***
## PC1          0.067717   0.075882   0.892  0.37247    
## PC2          1.214339   0.412772   2.942  0.00336 ** 
## PC3          0.018915   0.060619   0.312  0.75511    
## PC4         -0.016735   0.059056  -0.283  0.77697    
## PC5         -0.053864   0.057023  -0.945  0.34518    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8251 on 734 degrees of freedom
## Multiple R-squared:  0.2219, Adjusted R-squared:  0.2145 
## F-statistic: 29.91 on 7 and 734 DF,  p-value: < 2.2e-16
For_testing %>% filter(CMV_prediction == 1) %>% lm(`TL_NK-cells` ~ Age + Sex + PC1+PC2+PC3+PC4+PC5, . ) %>% summary()
## 
## Call:
## lm(formula = `TL_NK-cells` ~ Age + Sex + PC1 + PC2 + PC3 + PC4 + 
##     PC5, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1834 -0.7766 -0.0407  0.6552  3.8188 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.034962   0.257699  27.299  < 2e-16 ***
## Age         -0.033650   0.003957  -8.503 3.35e-16 ***
## Sex          0.018457   0.107413   0.172    0.864    
## PC1          0.030058   0.087654   0.343    0.732    
## PC2         -0.147324   0.187927  -0.784    0.434    
## PC3         -0.020478   0.095723  -0.214    0.831    
## PC4          0.003580   0.089066   0.040    0.968    
## PC5         -0.111518   0.091772  -1.215    0.225    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.057 on 416 degrees of freedom
## Multiple R-squared:  0.157,  Adjusted R-squared:  0.1428 
## F-statistic: 11.07 on 7 and 416 DF,  p-value: 7.516e-13
#The effect of sex disappears in individuals that are CMV positive
table(For_testing$Sex, For_testing$CMV_prediction) %>% chisq.test() #CMV - and CMV + are balanced in sex
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  .
## X-squared = 0.18584, df = 1, p-value = 0.6664
For_testing %>% filter(!is.na(CMV_prediction)) %>% mutate(Sex = ifelse(Sex==1, "Male", "Female"), CMV_prediction= ifelse(CMV_prediction==0, "seronegative", "seropositive") ) %>% ggplot(aes(x=Sex, y=`TL_NK-cells`, col=CMV_prediction )) + geom_boxplot(outlier.shape = NA) + theme_bw() + ggforce::geom_sina(alpha=0.5) + scale_color_manual(values = met.brewer(name="Cassatt1",n =5, type="discrete")[c(1,5)] ) -> Sex_interaction_effect

ggsave("~/Documents/GitHub/CMV_Associations/Results/Figures/CMV_sex_interaction.pdf", Sex_interaction_effect, height = 3, width = 6 )

print(Sex_interaction_effect)

Finally, we will perform mediation analysis to conclude to what extend CMV effect on telomeres is mediated by cell proportios

lambda.grid <- seq(from = 0.4, to = 0.01, by = -0.01)


left_join(left_join(Imputed_Telomere_markers, cell_matrix_xlr, by="ID"), select(Covariates, c(ID, CMV_prediction)), by="ID" ) %>% drop_na() -> Matched_data
for (Column in colnames(Matched_data)){
  if (Column %in% c("ID", "CMV_prediction") ){ next } 
    Matched_data[Column] = scale(Matched_data[Column])
}

x <-  select(Matched_data, CMV_prediction)
y <- select(Matched_data, colnames( select(Telomere_markers, -ID) )  )
med <- select(Matched_data, colnames( select(cell_matrix_xlr, -ID) )  )

fit.grid <-  mvregmed.grid(x, med, y, lambda.grid)
## fitting grid lambda [ 1 ] =  0.4 
## fitting grid lambda [ 2 ] =  0.39 
## fitting grid lambda [ 3 ] =  0.38 
## fitting grid lambda [ 4 ] =  0.37 
## fitting grid lambda [ 5 ] =  0.36 
## fitting grid lambda [ 6 ] =  0.35 
## fitting grid lambda [ 7 ] =  0.34 
## fitting grid lambda [ 8 ] =  0.33 
## fitting grid lambda [ 9 ] =  0.32 
## fitting grid lambda [ 10 ] =  0.31 
## fitting grid lambda [ 11 ] =  0.3 
## fitting grid lambda [ 12 ] =  0.29 
## fitting grid lambda [ 13 ] =  0.28 
## fitting grid lambda [ 14 ] =  0.27 
## fitting grid lambda [ 15 ] =  0.26 
## fitting grid lambda [ 16 ] =  0.25 
## fitting grid lambda [ 17 ] =  0.24 
## fitting grid lambda [ 18 ] =  0.23 
## fitting grid lambda [ 19 ] =  0.22 
## fitting grid lambda [ 20 ] =  0.21 
## fitting grid lambda [ 21 ] =  0.2 
## fitting grid lambda [ 22 ] =  0.19 
## fitting grid lambda [ 23 ] =  0.18 
## fitting grid lambda [ 24 ] =  0.17 
## fitting grid lambda [ 25 ] =  0.16 
## fitting grid lambda [ 26 ] =  0.15 
## fitting grid lambda [ 27 ] =  0.14 
## fitting grid lambda [ 28 ] =  0.13 
## fitting grid lambda [ 29 ] =  0.12 
## fitting grid lambda [ 30 ] =  0.11 
## fitting grid lambda [ 31 ] =  0.1 
## fitting grid lambda [ 32 ] =  0.09 
## fitting grid lambda [ 33 ] =  0.08 
## fitting grid lambda [ 34 ] =  0.07 
## fitting grid lambda [ 35 ] =  0.06 
## fitting grid lambda [ 36 ] =  0.05 
## fitting grid lambda [ 37 ] =  0.04 
## fitting grid lambda [ 38 ] =  0.03 
## fitting grid lambda [ 39 ] =  0.02 
## fitting grid lambda [ 40 ] =  0.01
#summary(fit.grid)
plot.mvregmed.grid(fit.grid)

#Take best model
mvfit.best <- mvregmed.grid.bestfit(fit.grid)

summary(mvfit.best)
## === alpha parameter estimates ===
##                                         mediator              x        alpha
## 1                                    CD24+ CD38+ CMV_prediction -0.004137783
## 2                         CD24+ CD38+ CD27+ IgM+ CMV_prediction  0.017566546
## 3                       CD4+ Naive CD45RA+ CD27+ CMV_prediction -0.009022280
## 4                       CD4+ Naive CD45RO- CD27+ CMV_prediction -0.024410664
## 5                                   CD4+ T cells CMV_prediction  0.015392413
## 6                        CD45RO- CD45RA+ T cells CMV_prediction -0.002716948
## 7                          CD8+ EM CD45RA- CD27- CMV_prediction  0.495692737
## 8                       CD8+ Naive CD45RA+ CD27+ CMV_prediction -0.024908889
## 9                       CD8+ Naive CD45RO- CD27+ CMV_prediction -0.026361211
## 10                                  CD8+ T cells CMV_prediction  0.149407324
## 11                                  Granulocytes CMV_prediction -0.009839873
## 12                                     IgD+ IgM- CMV_prediction  0.003650281
## 13                                   Lymphocytes CMV_prediction  0.064844921
## 14              Memory B cells (IgD+ IgM+ CD27+) CMV_prediction  0.016267710
## 15 Naive mature B cells (CD24+ CD38+ CD27- IgM+) CMV_prediction -0.010636523
## 16                NaiveB cells (IgD+ IgM+ CD27-) CMV_prediction -0.008434609
## 17      Natural effector (CD24+ CD38+ IgD+ IgM+) CMV_prediction  0.025946863
## 18                         NK cells (CD3- CD56+) CMV_prediction -0.003546364
## 19                               Prol CD4+ Tconv CMV_prediction -0.021986996
## 20                          T cells (CD3+ CD56-) CMV_prediction  0.043132867
## 21          Transitional B cells (CD24++ CD38++) CMV_prediction  0.018114967
## === beta parameter estimates ===
##                    y                            mediator         beta
## 1     TL_Lymphocytes              CD24+ CD38+ CD27+ IgM+ -0.001140651
## 2     TL_Lymphocytes            CD4+ Naive CD45RO- CD27+  0.113076267
## 3     TL_Lymphocytes               CD8+ EM CD45RA- CD27- -0.047472650
## 4     TL_Lymphocytes            CD8+ Naive CD45RA+ CD27+  0.101630510
## 5     TL_Lymphocytes                           IgD- CD5+ -0.014771567
## 6     TL_Lymphocytes      NaiveB cells (IgD+ IgM+ CD27-)  0.008481970
## 7     TL_Lymphocytes               NK cells (CD3- CD56+) -0.049822971
## 8     TL_Lymphocytes                NK dim (CD56+ CD16+) -0.002418774
## 9     TL_Lymphocytes                      Prol CD4+ Treg -0.026731458
## 10   TL_Granulocytes                        Granulocytes -0.007172279
## 11   TL_Granulocytes Intermediate monocytes (CD14+CD16+)  0.007636706
## 12   TL_Granulocytes                      Prol CD4+ Treg  0.012422716
## 13  TL_Naive_T-cells            CD4+ Naive CD45RA+ CD27+  0.012693811
## 14  TL_Naive_T-cells            CD4+ Naive CD45RO- CD27+  0.081947918
## 15  TL_Naive_T-cells            CD8+ Naive CD45RA+ CD27+  0.030834223
## 16  TL_Naive_T-cells            CD8+ Naive CD45RO- CD27+  0.127364623
## 17  TL_Naive_T-cells                         Lymphocytes -0.055783714
## 18  TL_Naive_T-cells               NK cells (CD3- CD56+) -0.058013273
## 19  TL_Naive_T-cells                      Prol CD4+ Treg -0.007564126
## 20 TL_Memory_T-cells                        Granulocytes  0.009863018
## 21 TL_Memory_T-cells Intermediate monocytes (CD14+CD16+) -0.003937028
## 22 TL_Memory_T-cells                      Prol CD4+ Treg -0.022189828
## 23        TL_B-cells                      DN (CD4- CD8-)  0.034342805
## 24        TL_B-cells                           IgD- CD5+  0.059624321
## 25        TL_B-cells                           IgD+ CD5+ -0.042800791
## 26        TL_B-cells                                IgM-  0.001572968
## 27        TL_B-cells                      Prol CD4+ Treg  0.010052356
## 28       TL_NK-cells               CD8+ EM CD45RA- CD27- -0.035418474
## 29       TL_NK-cells                        CD8+ T cells  0.002979670
## 30       TL_NK-cells               NK cells (CD3- CD56+)  0.019207644
## === delta parameter estimates ===
##                   y              x       delta
## 1    TL_Lymphocytes CMV_prediction -0.04340894
## 2  TL_Naive_T-cells CMV_prediction -0.00847031
## 3 TL_Memory_T-cells CMV_prediction -0.04607939
## 4       TL_NK-cells CMV_prediction -0.16678170
#Default eps= 0.01 ;set to 0.04, which will show three CMV-telomere relpationships
mvfit.edges <- mvregmed.edges(mvfit.best, eps = 0.01) #Change EPS to show more/less edges
pdf("~/Documents/GitHub/CMV_Associations/Results/Figures/Network_CMV.pdf",)
plot.mvregmed.edges(mvfit.edges, x.color = "palegreen", y.color = "palevioletred", med.color = "skyblue", v.size = 10, seed = 3)
dev.off()
## quartz_off_screen 
##                 2
left_join(Imputed_Telomere_markers, Covariates) %>% 
lm(`TL_NK-cells` ~ CMV_prediction, .) %>% summary()
## 
## Call:
## lm(formula = `TL_NK-cells` ~ CMV_prediction, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7916 -0.6916 -0.0916  0.6084  4.2148 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.29156    0.03718  169.21   <2e-16 ***
## CMV_prediction -0.70641    0.06166  -11.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.013 on 1164 degrees of freedom
##   (15 observations deleted due to missingness)
## Multiple R-squared:  0.1013, Adjusted R-squared:  0.1006 
## F-statistic: 131.3 on 1 and 1164 DF,  p-value: < 2.2e-16
#Show some individual mediation
left_join(Telomere_markers, cell_matrix_xlr) %>% left_join(Covariates) %>% dplyr::select( `CD8+ EM CD45RA- CD27-`, CMV_prediction,  `TL_NK-cells`, Age, Sex, `CD8+ T cells`, `NK cells (CD3- CD56+)`  ) %>% drop_na() -> Mediation_input
fit.mediator=lm(`CD8+ EM CD45RA- CD27-`~ CMV_prediction + Age + Sex + `CD8+ T cells` ,  Mediation_input )
fit.dv=lm( `TL_NK-cells` ~ CMV_prediction +`CD8+ EM CD45RA- CD27-` +`CD8+ T cells` + Age + Sex, Mediation_input )

results = mediate(fit.mediator, fit.dv, treat='CMV_prediction', mediator='CD8+ EM CD45RA- CD27-', boot=T)
summary(results) #31%
## 
## Causal Mediation Analysis 
## 
## Nonparametric Bootstrap Confidence Intervals with the Percentile Method
## 
##                Estimate 95% CI Lower 95% CI Upper p-value    
## ACME             -0.160       -0.245        -0.07   0.002 ** 
## ADE              -0.349       -0.503        -0.20  <2e-16 ***
## Total Effect     -0.509       -0.646        -0.37  <2e-16 ***
## Prop. Mediated    0.315        0.143         0.52   0.002 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Sample Size Used: 970 
## 
## 
## Simulations: 1000
fit.mediator2=lm(`CD8+ T cells`~ CMV_prediction + Age + Sex + `CD8+ EM CD45RA- CD27-` ,  Mediation_input )
fit.dv2=lm( `TL_NK-cells` ~ CMV_prediction +`CD8+ T cells` + Age + Sex + `CD8+ EM CD45RA- CD27-`, Mediation_input )
results2 = mediate(fit.mediator2, fit.dv2, treat='CMV_prediction', mediator='CD8+ T cells', boot=T)
summary(results2) #no significant
## 
## Causal Mediation Analysis 
## 
## Nonparametric Bootstrap Confidence Intervals with the Percentile Method
## 
##                Estimate 95% CI Lower 95% CI Upper p-value    
## ACME            0.00298     -0.01369         0.02    0.72    
## ADE            -0.34884     -0.49732        -0.19  <2e-16 ***
## Total Effect   -0.34586     -0.49658        -0.19  <2e-16 ***
## Prop. Mediated -0.00862     -0.06916         0.04    0.72    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Sample Size Used: 970 
## 
## 
## Simulations: 1000
fit.mediator3=lm(`NK cells (CD3- CD56+)` ~ CMV_prediction + Age + Sex ,  Mediation_input )
fit.dv3=lm( `TL_NK-cells` ~ CMV_prediction +`NK cells (CD3- CD56+)`  + Age + Sex, Mediation_input )
results3 = mediate(fit.mediator3, fit.dv3, treat='CMV_prediction', mediator='NK cells (CD3- CD56+)', boot=T)
summary(results3) #No significant mediation
## 
## Causal Mediation Analysis 
## 
## Nonparametric Bootstrap Confidence Intervals with the Percentile Method
## 
##                Estimate 95% CI Lower 95% CI Upper p-value    
## ACME           -0.00161     -0.01016         0.01     0.7    
## ADE            -0.53964     -0.68098        -0.41  <2e-16 ***
## Total Effect   -0.54126     -0.68176        -0.41  <2e-16 ***
## Prop. Mediated  0.00298     -0.01004         0.02     0.7    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Sample Size Used: 970 
## 
## 
## Simulations: 1000

We explore if CMV infection affects correlations between biomarkers

Compute_heterogeneity = function(C1, C2, Sample_size1=dim(S1)[1], Sample_size2=dim(S2)[1] ){

  #compute heterogeneity Pvalue using Fisher's z-test 
  #Fisher's z-test is used to compare the difference between two correlation coefficients. It assumes that the correlation coefficients are normally distributed after being transformed using Fisher's z-transformation.
  C1 %>% as.data.frame() %>%  rownames_to_column("Marker1") %>% as_tibble() %>% gather(Marker2, Correlation ,2:(dim(C1)[2]+1) ) %>% filter(! Correlation == 1) ->  C1_long
  C2 %>% as.data.frame() %>%  rownames_to_column("Marker1") %>% as_tibble() %>% gather(Marker2, Correlation ,2:(dim(C1)[2]+1) ) %>% filter(! Correlation == 1) -> C2_long
  left_join(C1_long, C2_long, by = c("Marker1", "Marker2"), suffix=c("_CMV", "_noCMV")) -> DF
  
  Done = c()
  Results_meta =tibble()
  for (line in seq(nrow(DF))){
    L = DF[line,]
    Name= paste0(sort(c(L$Marker1, L$Marker2)), collapse="|")
    #if (Name == "MetaboAge|MetaboHealth" ){ break }
    if (Name %in% Done ){ next 
    } else { Done = c(Done, Name) } 
    
    z1 = atanh(L$Correlation_CMV)
    z2 = atanh(L$Correlation_noCMV)
    se_diff <- sqrt(1/(Sample_size1 - 3) + 1/(Sample_size2 - 3))
  
    z_stat <- (z1 - z2) / se_diff
    p_value <- 2 * (1 - pnorm(abs(z_stat)))
    
    tibble(N =Name, Z_stat= z_stat, P=p_value, Rho_CMV1=L$Correlation_CMV, Rho_CMV0=L$Correlation_noCMV, N_CMV1=Sample_size1, N_CMV0=Sample_size2 ) %>% rbind(Results_meta, .) -> Results_meta
  }
  Results_meta %>% arrange(desc(abs(Z_stat))) %>% return()

}
Plot_heatmaps = function(C1, C2, Output_name1="Results/Figures/Aging_heatmap_CMVpositive.png", Output_name2="Results/Figures/Aging_heatmap_CMVnegative.png" ){
  C1 %>% pheatmap() -> heatmap1
  C1[heatmap1$tree_row$order, heatmap1$tree_col$order]  %>% pheatmap(cluster_rows=F, cluster_cols=F, display_numbers = T, main = "CMV_positive") -> heatmap1.2
  C2[heatmap1$tree_row$order, heatmap1$tree_col$order] %>% pheatmap(cluster_rows=F, cluster_cols=F, display_numbers = T, main = "CMV_negative") -> heatmap2
  #save
  ggsave( paste0(Location_dir, Output_name1), plot = heatmap1.2, dpi = 300)
  ggsave( paste0(Location_dir, Output_name2), plot = heatmap2, dpi = 300)
}


Aging_health_markers %>% dplyr::select(-ID) %>% drop_na() %>% cor() %>% pheatmap()

#For CMV positive
Aging_health_markers %>% filter(ID %in%  filter(Covariates, CMV_prediction == 1)$ID ) %>% drop_na() ->S1
S1 %>% dplyr::select(-ID) %>% drop_na() %>% cor()  -> C1


#For CMV negative
Aging_health_markers %>% filter(ID %in%  filter(Covariates, CMV_prediction == 0)$ID ) %>% drop_na() -> S2
S2 %>% dplyr::select(-ID)  %>% cor() -> C2

Plot_heatmaps(C1,C2)

## Saving 7 x 5 in image
## Saving 7 x 5 in image

Compute_heterogeneity(C1, C2) -> Diff_noresid


#Use Age-residualized data
Remove_variability(Aging_health_markers, Covariates, Covariates_n = c("Age") ) -> Aging_health_markers2
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(Covariates_n)` instead of `Covariates_n` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Joining, by = "ID"Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(Feature)` instead of `Feature` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
Aging_health_markers2 %>% filter(ID %in%  filter(Covariates, CMV_prediction == 1)$ID ) %>% drop_na() ->S3
S3 %>% dplyr::select(-ID)  %>% cor()  -> C3
Aging_health_markers2 %>% filter(ID %in%  filter(Covariates, CMV_prediction == 0)$ID ) %>% drop_na() ->S4
S4 %>% dplyr::select(-ID)  %>% cor()  -> C4

Plot_heatmaps(C3,C4, "Results/Figures/ResidAging_heatmap_CMVpositive.png", "Results/Figures/ResidAging_heatmap_CMVnegative.png")

## Saving 7 x 5 in image
## Saving 7 x 5 in image

Compute_heterogeneity(C3, C4, dim(S3)[1], dim(S4)[1] ) -> Diff_resid


left_join(Diff_noresid, Diff_resid, by="N", suffix=c("","_resid")) %>% mutate(Confounded_by_age = abs(Z_stat_resid-Z_stat) ) -> Heterogeneity

write_tsv(Heterogeneity, paste0(Location_dir, "Results/Heterogeneity_confounding_CMV_Aging.tsv" ))

Next, we will replicate the TL assocations. 300BCG is a cohort with both young and old participants. TLs were measured in the participants, and there is serology for EBV and CMV. The TLs were measured by qPCR. They reflect the average telomere length per chromosome end. It is corrected for genome numbers with a single-copy reference gene. You can find more details in the kit’s manual: https://sciencellonline.com/PS/8918.pdf. Meaning that it is an average between many cell types, and the effects might be weaker.

read_excel(paste0(Location_dir,"Data/300BCG_Telomere_Sergio.xlsx"), sheet = "Old") %>% rename(PatientID=`Subject ID`, TL=`Visit 1 Length`, CMV1 = `CMV stat...7`, CMV2=`CMV stat...9`   ) -> TL_old 
## New names:
read_excel(paste0(Location_dir,"Data/300BCG_Telomere_Sergio.xlsx"), sheet = "Young") %>% rename(PatientID=`Subject ID`, TL=`Visit 1 Length`, CMV1 = `CMV stat...7`, CMV2=`CMV stat...9`   ) -> TL_young
## New names:
rbind(TL_old %>% mutate(Age_group = "old"), TL_young %>% mutate(Age_group = "young") ) -> TL_replication
TL_replication %>% mutate(CMV1 = ifelse(CMV1=="-", 0, 1 ), CMV2 =ifelse(CMV2=="Negative", 0, 1 )  ) -> TL_replication
TL_replication %>% filter(!CMV1 == CMV2)
## # A tibble: 2 × 11
##   Patien…¹   Sex Age      TL >> av…² EBV s…³  CMV1 >> fr…⁴  CMV2 >> fr…⁵ Age_g…⁶
##      <dbl> <dbl> <chr> <dbl> <lgl>   <chr>   <dbl> <lgl>   <dbl> <lgl>   <chr>  
## 1       10     1 20     5.75 NA      -           1 NA          0 NA      young  
## 2      174     0 20     8.63 NA      +           0 NA          1 NA      young  
## # … with abbreviated variable names ¹​PatientID,
## #   ²​`>> average telomere length per chromosome end (kb)`, ³​`EBV stat`,
## #   ⁴​`>> from the file "2019 300BCG serologies_for_sergio"`,
## #   ⁵​`>> from the file "CMV IgG index 300BCG"`, ⁶​Age_group
TL_replication %>% group_by(CMV1) %>% summarise(n())
## # A tibble: 2 × 2
##    CMV1 `n()`
##   <dbl> <int>
## 1     0    74
## 2     1    24
TL_replication %>% ggplot(aes(TL)) + geom_histogram() + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

TL_replication %>% ggplot(aes(x = Age_group , y = TL )) + geom_boxplot() + theme_bw()

TL_replication %>% ggplot(aes(x = as.numeric(Age) )) + geom_histogram() + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

lm(TL  ~ Age_group + Sex + CMV1, TL_replication) %>% summary() #We see a negative trend with CMV infection, however is not significant
## 
## Call:
## lm(formula = TL ~ Age_group + Sex + CMV1, data = TL_replication)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2596 -1.8305 -0.0573  1.3456  8.7296 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.5933     0.7590   8.686 1.14e-13 ***
## Age_groupyoung   2.9147     0.7255   4.018 0.000118 ***
## Sex              1.1430     0.5686   2.010 0.047262 *  
## CMV1            -0.6932     0.6798  -1.020 0.310519    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.805 on 94 degrees of freedom
## Multiple R-squared:  0.1979, Adjusted R-squared:  0.1723 
## F-statistic:  7.73 on 3 and 94 DF,  p-value: 0.000114
lm(TL  ~ Age_group + Sex*CMV1, TL_replication) %>% summary() #We also see a negative interaction in females, however, not significant
## 
## Call:
## lm(formula = TL ~ Age_group + Sex * CMV1, data = TL_replication)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2666 -1.8233 -0.0721  1.3423  8.7512 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.58960    0.76777   8.583 2.03e-13 ***
## Age_groupyoung  2.91115    0.73377   3.967 0.000143 ***
## Sex             1.15712    0.65676   1.762 0.081380 .  
## CMV1           -0.66334    0.96685  -0.686 0.494364    
## Sex:CMV1       -0.05835    1.33713  -0.044 0.965284    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.82 on 93 degrees of freedom
## Multiple R-squared:  0.1979, Adjusted R-squared:  0.1634 
## F-statistic: 5.736 on 4 and 93 DF,  p-value: 0.0003593
TL_replication %>% ggplot( aes(x=as.factor(CMV1), y= TL ) ) + geom_boxplot(outlier.shape = NA) + ggforce::geom_sina() + theme_bw()

lm(TL  ~ CMV1, TL_replication ) %>% summary()
## 
## Call:
## lm(formula = TL ~ CMV1, data = TL_replication)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2640 -1.9817 -0.4878  1.7002 10.3465 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.6153     0.3545  27.121   <2e-16 ***
## CMV1         -1.2744     0.7164  -1.779   0.0784 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.05 on 96 degrees of freedom
## Multiple R-squared:  0.03191,    Adjusted R-squared:  0.02183 
## F-statistic: 3.164 on 1 and 96 DF,  p-value: 0.07843